support <- read_tsv(here::here("NYGC_ALS_RNA_seq_support.tsv")) %>% as.data.frame()

library_sizes <- read_tsv(here::here("STMN2-pipeline/results/all_nygc_nov_2019/all_nygc_nov_2019_library_sizes.tsv"),col_names = c("sample", "library_size") ) %>%
  mutate( sample = gsub("data/metrics/", "", sample) )

support <-
  support %>%
  # impute missing values
  mutate( rin = replace_na(rin, median(rin, na.rm = TRUE)),
          age = replace_na(age, median(age, na.rm = TRUE)),
          pmi = replace_na(pmi, median(pmi, na.rm = TRUE))) %>%
  left_join(library_sizes, by = "sample")

## fix disease == "Other" samples
# if disease_full contains ALS and FTD then mark as ALS-FTD
# assume sample with ALS and Dementia (NOS) is ALS-FTD
support <- 
  support %>%
  mutate( disease = case_when(
    disease == "Other" & grepl("ALS", disease_full) & grepl("FTD|NOS", disease_full) ~ "ALS-FTD",
    disease == "Other" & grepl("ALS", disease_full) & !grepl("FTD|NOS", disease_full) ~ "ALS",
    TRUE ~ disease
  ))

support <- 
  support %>%
  mutate( pathology = case_when(
    disease == "Control" ~ "Control",
    disease == "FTD" & mutations == "C9orf72" ~ "FTD-C9",
    disease == "FTD" & mutations != "C9orf72" & ( is.na(pathology) | grepl("TDP", pathology) ) ~ "FTD-TDP",
    mutations == c("SOD1", "ANG") ~ "ALS-other",
    mutations == "C9orf72" & disease == "ALS" ~ "ALS-C9",
    mutations != "C9orf72" & disease == "ALS" ~ "ALS",
    disease == "ALS-FTD" ~ "ALS-FTD",
    disease == "ALS-AD" ~ "ALS-AD",
    disease == "Other" ~ "Other",
    TRUE ~ pathology
  )) %>%
  filter( ! tissue %in% c("Cell_Line", "Liver")) %>% # ignored these
  mutate(tissue = gsub("_"," ", tissue)) %>%
  #filter(tissue %in% c("Cerebellum", "Frontal Cortex", "Temporal Cortex", "Lumbar Spinal Cord", "Cervical Spinal Cord", "Thoracic Spinal Cord", "Lateral Motor Cortex", "Medial Motor Cortex", "Occipital Cortex") ) %>%
 # mutate( tissue = factor(tissue,levels = c("Cerebellum", "Frontal Cortex", "Temporal Cortex", "Lumbar Spinal Cord", "Cervical Spinal Cord", "Thoracic Spinal Cord", "Lateral Motor Cortex", "Medial Motor Cortex", "Occipital Cortex") ) )%>%
 filter(!duplicated(sample)) # remove single duplicated sample - CGND-HRA-00431 

#table(support$disease, support$pathology)

# categorise into and ALS, FTD, ALS-FTD and +/- TDP-43
support <-
  support %>%
  mutate( tdp_status = case_when(
    disease == "Control" ~ "TDP-43 negative",
    pathology %in% c("ALS", "ALS-C9", "ALS-FTD", "ALS-AD", "FTD-C9", "FTD-TDP") ~ "TDP-43 positive",
    pathology %in% c("ALS-other","FTD-TAU", "FTD-FUS") ~ "TDP-43 negative",
    TRUE ~ "unknown"
  ) ) %>%
    mutate( disease_tdp = case_when(
    disease == "ALS" & tdp_status == "TDP-43 positive" ~ "ALS",
    disease == "ALS" & tdp_status == "TDP-43 negative" ~ "ALS-other",
    disease == "FTD" & tdp_status == "TDP-43 positive" ~ "FTD-TDP",
    disease == "FTD" & tdp_status == "TDP-43 negative" ~ "FTD-other",
    TRUE ~ pathology
  ))


#table(support$disease, support$tdp_status)
#table(support$pathology, support$tdp_status)

rownames(support) <- support$sample

table(support$pathology)
## 
##       ALS    ALS-AD    ALS-C9   ALS-FTD ALS-other   Control    FTD-C9 
##       565        64        86        92        16       183        45 
##   FTD-FUS   FTD-TAU   FTD-TDP     Other 
##        12        19        81        60
table(support$tissue, support$disease_tdp) %>% 
  knitr::kable() %>%
  kableExtra::kable_styling()
ALS ALS-AD ALS-FTD ALS-other Control FTD-other FTD-TDP Other
Cerebellum 140 10 17 6 31 11 50 10
Cervical Spinal Cord 97 9 13 0 27 0 1 8
Frontal Cortex 96 10 17 4 37 12 37 9
Hippocampus 7 0 3 0 0 0 0 1
Lateral Motor Cortex 46 6 6 2 9 0 1 5
Lumbar Spinal Cord 82 9 12 1 27 0 1 9
Medial Motor Cortex 52 5 5 0 11 0 1 7
Occipital Cortex 42 7 6 0 6 0 1 4
Other 1 0 0 0 0 0 0 0
Sensory Cortex 2 0 0 0 0 0 0 0
Temporal Cortex 22 3 3 0 26 8 33 2
Thoracic Spinal Cord 38 5 6 3 8 0 1 4
Unspecified Motor Cortex 26 0 4 0 1 0 0 1

For all samples used here, get junctions and cluster with Leafcutter. Can I find evidence of TDP-43 linked splicing events? Do they correlate with cellular composition?

STMN2 is both a neuronal marker and TDP-43 splicing target. Clustering junctions with leafcutter allows the quantification of both STMN2 expression and STMN2 mis-splicing. Do both of these variables correlate with cell-type composition?

# all CNS samples currently available
all_junctions <- 
  read.table(file = here::here("STMN2-pipeline/results/all_nygc_nov_2019/all_nygc_nov_2019_perind_numers.counts.gz"), header = TRUE, stringsAsFactors = FALSE)

meta <- leafcutter::get_intron_meta(row.names(all_junctions))

# find cluster containing novel junction
stmn2_clu <- filter(meta, chr == 'chr8', start >= 79611214, end <= 79636802) %>%
  pull(clu) %>%
  unique()

stmn2 <- all_junctions[ meta$clu == stmn2_clu, ]
stmn2 <- rownames_to_column(stmn2, var = "junction")

# reshape the table
stmn2 <- 
  tidyr::gather(stmn2, key = "sample", value = "counts", -junction) %>%
 # tidyr::spread(key = junction, value = counts) %>%
  mutate(sample = gsub(".Aligned.Quality.Sorted.bam", "", sample, fixed = TRUE) ) %>%
  mutate(sample = gsub(".", "-", sample, fixed = TRUE)) %>%
  inner_join(support, by = "sample") 

# annotate junctions
stmn2_meta <- leafcutter::get_intron_meta(stmn2$junction) %>%
  mutate( anno = case_when( start ==79611214 & end == 79616822 ~ "novel_5SS",
                            start == 79611214 & end == 79636802 ~ "major",
                            start == 79611791 ~ "novel 3SS A",
                            start == 79611433 ~ "novel 3SS B"))

stmn2$junction_type <- stmn2_meta$anno

# library size is in total reads 
# divide by 2 to get read pairs
# divide counts by total read pairs to normalise
# times by 1e6 get per million

stmn2$counts_tpm <- (stmn2$counts / (stmn2$library_size /2 )  ) * 1e6

# only plot tissues with 20+ samples
tissues_to_plot <-
  group_by(support, tissue ) %>% tally() %>%
  filter(n >= 20) %>%
  pull(tissue)

# for now exclude "other" and "other FTD" categories
stmn2_als_ftd <- 
  stmn2 %>%
   filter(pathology != "Other" ) %>%
  mutate( pathology = forcats::fct_relevel(pathology, "Control"))  %>%
  mutate( disease = forcats::fct_relevel(disease, "Control"))  %>%
  filter(tissue %in% tissues_to_plot) %>%
  mutate( tissue = gsub("_", " ", tissue )) %>%
  mutate(tissue = factor(tissue, levels = c("Frontal Cortex", "Temporal Cortex", "Cerebellum", "Cervical Spinal Cord", "Thoracic Spinal Cord", "Lumbar Spinal Cord", "Lateral Motor Cortex", "Medial Motor Cortex", "Occipital Cortex") ))

STMN2 canonical junction

Make plots

Plot normalised junction counts of major junction and novel junction

# diagnostic plot
stmn2_als_ftd %>%
  filter( junction_type %in% c("major") ) %>%
  filter( disease %in% c("Control", "ALS", "FTD")) %>%
  filter(tissue %in% c("Frontal Cortex", "Cerebellum")) %>%
  #filter( platform %in% c("HiSeq 2500", "NovaSeq")) %>%
  ggplot(aes( x = disease, y = (counts_tpm), group = paste(disease, prep) )) + 
  geom_point(width = 0.05, aes(colour = prep), size = 0.5, alpha = 0.9, position = position_jitterdodge(jitter.width = 0.2, jitter.height = 0) ) + 
  facet_wrap(~tissue, ncol = 2) +
  labs(y = "junction counts per million", x = "", title = "STMN2 canonical junction", subtitle = "Effect of library prep method on counts") +
  scale_alpha_manual(values = c(1,0.1) ) +
  #scale_colour_manual(values = c("black", "gray")) +
  #guides(colour = FALSE, alpha = FALSE) +
  theme_jh() +
  geom_boxplot(fill = NA, colour = "black", outlier.colour = NA) +
  theme(panel.border = element_rect(fill =NA))
## Warning: Ignoring unknown parameters: width

There is an effect of library prep interacting with disease status on the STMN2 major junction.

stmn2_als_ftd %>%
  filter( junction_type %in% c("major") ) %>%
  filter( disease %in% c("Control", "ALS", "FTD", "ALS-FTD")) %>%
  filter(tissue %in% c("Frontal Cortex", "Cerebellum")) %>%
  filter( !is.na(prep) ) %>%
  ggplot(aes( x = rin, y = (counts_tpm) )) + 
  geom_point(aes(colour = disease), size = 1) +
  #geom_jitter(width = 0.05, aes(colour = platform), size = 0.5, alpha = 0.9) + 
  facet_wrap(~ prep + tissue, ncol = 3) +
  labs(y = "junction counts per million", x = "", title = "STMN2 canonical junction correlates with RIN in a prep-specific effect") +
  scale_alpha_manual(values = c(1,0.1)) +
  #scale_colour_manual(values = c("black", "gray")) +
  #guides(colour = FALSE, alpha = FALSE) +
  theme_jh() +
  #geom_boxplot(fill = NA, colour = "black", outlier.colour = NA) +
  theme(panel.border = element_rect(fill =NA)) +
  ggpubr::stat_cor()

# levels in all samples - HiSeq
stmn2_als_ftd %>%
  filter( junction_type %in% c("major") ) %>%
  filter(platform == "HiSeq 2500") %>%
  filter(tissue != "Temporal Cortex") %>%
  ggplot(aes( x = forcats::fct_rev(pathology), y = (counts_tpm) )) + 
  geom_jitter(width = 0.05, aes(colour = platform), size = 0.5, alpha = 0.9) + 
  facet_wrap(~ tissue, ncol = 3) +
  coord_flip() + 
  labs(y = "junction counts per million", x = "", title = "STMN2 canonical junction", subtitle =  "HiSeq samples") +
  scale_alpha_manual(values = c(1,0.1)) +
  #scale_colour_manual(values = c("black", "gray")) +
  guides(colour = FALSE, alpha = FALSE) +
  theme_jh() +
  geom_boxplot(fill = NA, colour = "black", outlier.colour = NA) +
  theme(panel.border = element_rect(fill =NA))

stmn2_als_ftd %>%
  filter( junction_type %in% c("major") ) %>%
  filter(platform == "NovaSeq") %>%
  filter( tissue %in% c("Frontal Cortex", "Temporal Cortex", "Cerebellum", "Cervical Spinal Cord", "Lumbar Spinal Cord") )  %>%
  #filter(tissue != "Temporal Cortex") %>%
  ggplot(aes( x = forcats::fct_rev(pathology), y = (counts_tpm) )) + 
  geom_jitter(width = 0.05, colour = "black", size = 0.5, alpha = 0.9) + 
  facet_wrap(~ tissue, ncol = 3) +
  coord_flip() + 
  labs(y = "junction counts per million", x = "", title = "STMN2 canonical junction", subtitle =  "NovaSeq samples") +
  scale_alpha_manual(values = c(1,0.1)) +
  #scale_colour_manual(values = c("black", "gray")) +
  guides(colour = FALSE, alpha = FALSE) +
  theme_jh() +
  geom_boxplot(fill = NA, colour = "black", outlier.colour = NA) +
  theme(panel.border = element_rect(fill =NA))

There is clearly an effect of library prep and/or sequencing platform on the quantification of STMN2 splicing. This is confounding by the fact that most ALS samples were sequenced in one batch while most of the FTD samples were sequenced in the other batch. However, some trends emerge. ALS samples have consistently higher levels of STMN2 major isoform than controls in the frontal cortex, whereas FTD samples have lower levels of STMN2 in the frontal and temporal cortices.

# # expression of major isoform
# stmn2_major_plot <- stmn2 %>%
#   left_join(stmn_rsem_voom, by = "sample") %>%
#   filter( junction_type %in% c("major isoform") ) %>%
#   ggplot(aes( x = forcats::fct_rev(pathology), y = stmn2_expression )) +  
#   geom_jitter(width = 0.1, colour = "black", size = 0.5, alpha = 0.9) + 
#   facet_wrap(~ tissue, ncol = 3, scale = "free_y") +
#   coord_flip() + 
#   labs(y = "normalised gene counts", x = "", title = "STMN2 canonical isoform") +
#   scale_alpha_manual(values = c(1,0.1)) +
#   #scale_colour_manual(values = c("black", "gray")) +
#   guides(colour = FALSE, alpha = FALSE) +
#   theme_jh() +
#   geom_boxplot(fill = NA, colour = "black", outlier.colour = NA) +
#   theme_jh() +
#   theme(panel.border = element_rect(fill = NA), axis.line = element_blank())

# # expression of major isoform
# novaseq_only_stmn2_major_plot <- stmn2 %>%
#   left_join(stmn_rsem_voom, by = "sample") %>%
#   filter( junction_type %in% c("major isoform") ) %>%
#   filter(platform == "NovaSeq") %>%
#   ggplot(aes( x = forcats::fct_rev(pathology), y = stmn2_expression )) +  
#   geom_jitter(aes(colour = platform), width = 0.1, size = 0.5, alpha = 0.9) + 
#   facet_wrap(~ tissue, ncol = 3, scale = "free_y") +
#   coord_flip() + 
#   labs(y = "normalised gene counts", x = "", title = "STMN2 canonical isoform - NovaSeq samples only") +
#   scale_alpha_manual(values = c(1,0.1)) +
#   #scale_colour_manual(values = c("black", "gray")) +
#   guides(colour = FALSE, alpha = FALSE) +
#   theme_jh() +
#   geom_boxplot(fill = NA, colour = "black", outlier.colour = NA) +
#   theme_jh() +
#   theme(panel.border = element_rect(fill = NA), axis.line = element_blank())
# 
# 
# stmn2_major_plot
# 
# ggsave(stmn2_major_plot, filename = "plots/STMN2/stmn2_all_samples_major_isoform.pdf", width = 9, height = 7, device = cairo_pdf)

ALS samples have higher STMN2 expression - is this due to batch effect of controls being mostly done on NovaSeq?

STMN2 novel junction

Plot counts of STMN2 novel junction.

# plot counts
# stmn2_als_ftd %>%
#   filter( junction_type %in% c("novel_5SS")) %>%
#   ggplot(aes( x = forcats::fct_rev(pathology), y = (counts_tpm ) )) +
#   geom_jitter(width = 0.5, aes(colour = pathology, alpha = counts_tpm == 0), size =1) +
#   facet_wrap(~ tissue, ncol = 5, scales = "free") +
#   coord_flip() +
#   labs(y = "counts per million", x = "", title = "STMN2 novel isoform") +
#   scale_alpha_manual(values = c(1,0.1)) +
#   #scale_colour_manual(values = c("black", "gray")) +
#   guides(colour = FALSE, alpha = FALSE) +
#   theme_jh() +
#   theme(panel.border = element_rect(fill = NA), axis.line = element_blank())
# 
# # plot counts
# stmn2 %>%
#   filter( junction_type %in% c("novel_5SS")) %>%
#   ggplot(aes( x = forcats::fct_rev(pathology), y = (counts ) )) +
#   geom_jitter(width = 0.2, height = 0, aes(colour = pathology, alpha = counts_tpm == 0), size =1) +
#   facet_wrap(~ tissue, ncol = 5, scales = "free") +
#   coord_flip() +
#   labs(y = "raw junction counts", x = "", title = "STMN2 novel isoform") +
#   scale_alpha_manual(values = c(1,0.1)) +
#   #scale_colour_manual(values = c("black", "gray")) +
#   guides(colour = FALSE, alpha = FALSE) +
#   theme_jh() +
#   theme(panel.border = element_rect(fill = NA), axis.line = element_blank())

Bar Plots of STMN2 novel splicing

Treat as binary outcome

Try to plot multiple thresholds simultaneously eg >= 1, >=2, >=5 in the same chart maybe with patchwork?

# # treat as binary - counts > 0
# stmn2_proportions_1 <- 
#   stmn2 %>%
#   filter( junction_type %in% c("novel_5SS")) %>%
#     filter( tissue %in% tissues_to_plot) %>%
#     mutate( pathology = forcats::fct_relevel(pathology, "Control")) %>%
#   mutate(stmn2_status = counts > 0 ) %>%
#   group_by( pathology, tissue ) %>%
#   summarise( total = n(), stmn2_positive = sum(stmn2_status) ) %>%
#   mutate( prop = stmn2_positive / total )  %>%
#   mutate(stmn2_positive = as.character(stmn2_positive)) %>%
#   mutate(stmn2_positive = ifelse( stmn2_positive == "0", "", stmn2_positive))
# 
# 
# stmn2_proportions_2 <- 
#   stmn2 %>%
#   filter( junction_type %in% c("novel_5SS")) %>%
#   filter( tissue %in% tissues_to_plot) %>%
#     mutate( pathology = forcats::fct_relevel(pathology, "Control")) %>%
#   mutate(stmn2_status = counts > 1 ) %>%
#   group_by( pathology, tissue ) %>%
#   summarise( total = n(), stmn2_positive = sum(stmn2_status) ) %>%
#   mutate( prop = stmn2_positive / total ) %>%
#   mutate(stmn2_positive = as.character(stmn2_positive)) %>%
#   mutate(stmn2_positive = ifelse( stmn2_positive == "0", "", stmn2_positive))
# 
# 
# stmn2_prop_plot_1 <- 
#   stmn2_proportions_1 %>%
#   ggplot(aes( x = forcats::fct_rev(pathology), y = prop )) +
#   geom_col() + 
#   geom_text(aes(label = total), y = 1.05, size = 3) +
#   geom_text(data = filter(stmn2_proportions_1, prop < 0.9), aes(label = stmn2_positive ), nudge_y = 0.05, size = 3) +
#   geom_text(data = filter(stmn2_proportions_1, prop >= 0.9), aes(label = stmn2_positive ), nudge_y = -0.05, size = 3, colour = "white") +
#   scale_y_continuous(expand = c(0,0), limits = c(0,1.1),labels = scales::percent ) +
#   #geom_errorbar(aes(ymin = prop, ymax = binom_conf_upper)) +
#   facet_wrap(~tissue, scales = "free_y") +
#   geom_hline(yintercept = 1, linetype = 3) +
#   coord_flip() +
#   labs(y = "% of samples with 1 or more novel STMN2 junction read", x = "", title = "STMN2 novel splicing across disease and tissue") +
#   theme_jh() +
#   theme( panel.border = element_rect(fill = NA), axis.line = element_blank() )
# 
# tdp_status_support <- support %>%
#   select(disease, pathology, tdp_status) %>% 
#   distinct()
# 
# 
# stmn2_prop_plot_2 <- 
#   stmn2_proportions_2 %>%
#   #left_join(tdp_status_support, by = "pathology") %>%
#   ggplot(aes( x = forcats::fct_rev(pathology), y = prop )) +
#   geom_col() + 
#   geom_text(aes(label = total), y = 1.05, size = 3) +
#   geom_text(data = filter(stmn2_proportions_2, prop < 0.9), aes(label = stmn2_positive ), nudge_y = 0.05, size = 3) +
#   geom_text(data = filter(stmn2_proportions_2, prop >= 0.9), aes(label = stmn2_positive ), nudge_y = -0.05, size = 3, colour = "white") +
#   scale_y_continuous(expand = c(0,0), limits = c(0,1.1),labels = scales::percent ) +
#   #geom_errorbar(aes(ymin = prop, ymax = binom_conf_upper)) +
#   facet_wrap(~tissue, scales = "free_y") +
#   geom_hline(yintercept = 1, linetype = 3) +
#   coord_flip() +
#   labs(y = "% of samples with 2 or more novel STMN2 junction reads", x = "", title = "STMN2 novel splicing across disease and tissue") +
#   theme_jh() +
#   theme( panel.border = element_rect(fill = NA), axis.line = element_blank() )
# 
# 
# stmn2_prop_plot_1
# stmn2_prop_plot_2

Count samples at different read count thresholds

stmn2_novel <- filter(stmn2, junction_type %in% c("novel_5SS")) 

stmn2_read_thresholds <- 
  stmn2_novel %>%
  filter(tissue %in% c("Frontal Cortex", "Cervical Spinal Cord", "Thoracic Spinal Cord", "Temporal Cortex", "Lumbar Spinal Cord", "Occipital Cortex", "Lateral Motor Cortex", "Medial Motor Cortex", "Cerebellum")) %>%
  mutate( stmn2_1 = counts == 1, stmn2_2 = counts >=2 & counts < 5, stmn2_5 = counts >= 5) %>%
  #group_by(tissue, pathology,TDP = tdp_status) %>%
  group_by(tissue, disease_tdp) %>%
  summarise( samples = n(), `1` = sum(stmn2_1) / n(), `2-4` = sum(stmn2_2) / n(), `5+` = sum(stmn2_5) / n()  ) %>%
  #tidyr::gather(key = "threshold", value = count, -tissue, -pathology, -TDP, -samples ) %>%
  tidyr::gather(key = "threshold", value = count, -tissue, -disease_tdp, -samples ) %>%
  ungroup() %>% 
  mutate( disease_tdp = forcats::fct_relevel(disease_tdp, "Control")) %>%
  mutate( disease_tdp = forcats::fct_rev(disease_tdp)) %>%
  mutate(tissue = factor(tissue, levels = c("Frontal Cortex", "Temporal Cortex", "Cerebellum", "Cervical Spinal Cord", "Thoracic Spinal Cord", "Lumbar Spinal Cord", "Lateral Motor Cortex", "Medial Motor Cortex", "Occipital Cortex") ))

stmn2_threshold_plot <-
  stmn2_read_thresholds %>%
  ggplot(aes(x = disease_tdp, y = count ) ) +
  geom_col(aes(alpha = threshold), fill = "blue", colour = "white") +
  #facet_wrap(~Tissue) +
  #coord_flip() + 
  geom_text(data = select(stmn2_read_thresholds, tissue, disease_tdp, samples) %>% distinct(), aes(label = samples), y = 1.05, size = 3) +
  #geom_text(data = filter(stmn2_proportions_2, prop < 0.9), aes(label = stmn2_positive ), nudge_y = 0.05, size = 3) +
  #geom_text(data = filter(stmn2_proportions_2, prop >= 0.9), aes(label = stmn2_positive ), nudge_y = -0.05, size = 3, colour = "white") +
  scale_y_continuous(expand = c(0,0), limits = c(0,1.1),labels = scales::percent ) +
  #geom_errorbar(aes(ymin = prop, ymax = binom_conf_upper)) +
  facet_wrap(~tissue, scales = "free_y") +
  geom_hline(yintercept = 1, linetype = 3) +
  labs(x = "", y = "Proportion of samples with STMN2 junctions") + 
  coord_flip() +
  #labs(y = "% of samples with 2 or more novel STMN2 junction reads", x = "", title = "STMN2 novel splicing across disease and tissue") +
  theme_jh() +
  scale_alpha_manual("read\nthreshold",values = c(0.3,0.6,1) ) +
  theme( panel.border = element_rect(fill = NA), axis.line = element_blank() )

stmn2_threshold_plot

ggsave(plot = stmn2_threshold_plot, dpi = 600, width = 9, height = 6, filename = "../figures/STMN2_read_threshold_NYGC.png" ) 

  
  # mutate( tdp_status = ifelse(tdp_status == "TDP-43 negative", yes = "-other", no = "-TDP")) %>%
  # group_by(Tissue = tissue, Disease = disease,`TDP-43` = tdp_status) %>%
  # summarise( samples = n(), `≥1` = sum(stmn2_1), `≥2` = sum(stmn2_2), `>5` = sum(stmn2_5)  ) %>%
  # knitr::kable(align = "c") %>%
  # kableExtra::kable_styling(full_width = FALSE) %>%
  # kableExtra::add_header_above(c(" " = 4, "Samples with STMN2 novel junction reads" = 3)) %>%
  #  kableExtra::column_spec(4, bold = TRUE) %>%
  # kableExtra::collapse_rows(columns = 1:2)

Tabling STMN novel junctions at different thresholds

stmn2_novel %>%
  mutate( stmn2_1 = counts >= 1, stmn2_2 = counts >= 2, stmn2_5 = counts >= 5) %>%
    mutate( tdp_status = ifelse(tdp_status == "TDP-43 negative", yes = "\\-", no = "\\+")) %>%
  group_by(disease,tdp_status) %>%
  summarise( samples = n(), `≥1` = sum(stmn2_1), `≥2` = sum(stmn2_2), `>5` = sum(stmn2_5)  ) %>%
  knitr::kable(align = "c") %>%
  kableExtra::kable_styling(full_width = FALSE) %>%
  kableExtra::add_header_above(c(" " = 3, "Samples with STMN2 novel junction reads" = 3)) %>%
   kableExtra::column_spec(3, bold = TRUE) 
Samples with STMN2 novel junction reads
disease tdp_status samples ≥1 ≥2 >5
ALS - 15 0 0 0
ALS + 650 208 154 80
ALS-AD + 64 18 13 6
ALS-FTD + 92 49 40 21
Control - 183 5 0 0
FTD - 31 1 0 0
FTD + 126 64 51 37
Other - 1 0 0 0
Other + 60 6 1 1
# by tissue
stmn2_novel %>%
  mutate( disease = forcats::fct_relevel(disease, "Control")) %>%
  filter(tissue %in% c("Frontal Cortex", "Cervical Spinal Cord", "Thoracic Spinal Cord", "Temporal Cortex", "Lumbar Spinal Cord", "Occipital Cortex", "Lateral Motor Cortex", "Medial Motor Cortex", "Cerebellum", "Unspecified Motor Cortex")) %>%
  mutate( stmn2_1 = counts >= 1, stmn2_2 = counts >= 2, stmn2_5 = counts >= 5) %>%
  mutate( tdp_status = ifelse(tdp_status == "TDP-43 negative", yes = "\\-", no = "\\+")) %>%
  group_by(Tissue = tissue, Disease = disease,`TDP-43` = tdp_status) %>%
  summarise( samples = n(), `≥1` = sum(stmn2_1), `≥2` = sum(stmn2_2), `>5` = sum(stmn2_5)  ) %>%
  knitr::kable(align = "c") %>%
  kableExtra::kable_styling(full_width = FALSE) %>%
  kableExtra::add_header_above(c(" " = 4, "Samples with STMN2 novel junction reads" = 3)) %>%
   kableExtra::column_spec(4, bold = TRUE) %>%
  kableExtra::collapse_rows(columns = 1:2)
Samples with STMN2 novel junction reads
Tissue Disease TDP-43 samples ≥1 ≥2 >5
Cerebellum Control - 31 3 0 0
ALS - 5 0 0 0
+ 140 4 1 0
ALS-AD + 10 0 0 0
ALS-FTD + 17 2 0 0
FTD - 11 0 0 0
+ 50 1 0 0
Other - 1 0 0 0
+ 10 1 0 0
Cervical Spinal Cord Control - 27 1 0 0
ALS + 97 62 45 26
ALS-AD + 9 6 4 2
ALS-FTD + 13 10 9 4
FTD + 1 0 0 0
Other + 8 0 0 0
Frontal Cortex Control - 37 1 0 0
ALS - 4 0 0 0
+ 96 10 8 3
ALS-AD + 10 0 0 0
ALS-FTD + 17 11 10 5
FTD - 12 0 0 0
+ 37 36 30 21
Other + 9 1 0 0
Lateral Motor Cortex Control - 9 0 0 0
ALS - 2 0 0 0
+ 46 17 10 7
ALS-AD + 6 1 1 1
ALS-FTD + 6 2 1 1
FTD + 1 0 0 0
Other + 5 1 0 0
Lumbar Spinal Cord Control - 27 0 0 0
ALS - 1 0 0 0
+ 82 56 46 31
ALS-AD + 9 6 6 3
ALS-FTD + 12 10 8 6
FTD + 1 1 0 0
Other + 9 1 0 0
Medial Motor Cortex Control - 11 0 0 0
ALS + 52 16 13 3
ALS-AD + 5 3 1 0
ALS-FTD + 5 3 2 1
FTD + 1 0 0 0
Other + 7 1 1 1
Occipital Cortex Control - 6 0 0 0
ALS + 42 2 0 0
ALS-AD + 7 0 0 0
ALS-FTD + 6 0 0 0
FTD + 1 0 0 0
Other + 4 0 0 0
Temporal Cortex Control - 26 0 0 0
ALS + 22 7 4 2
ALS-AD + 3 0 0 0
ALS-FTD + 3 3 3 1
FTD - 8 1 0 0
+ 33 26 21 16
Other + 2 1 0 0
Thoracic Spinal Cord Control - 8 0 0 0
ALS - 3 0 0 0
+ 38 19 14 5
ALS-AD + 5 2 1 0
ALS-FTD + 6 2 2 1
FTD + 1 0 0 0
Other + 4 0 0 0
Unspecified Motor Cortex Control - 1 0 0 0
ALS + 26 14 12 3
ALS-FTD + 4 3 3 1
Other + 1 0 0 0

What are these “Other” samples that have STMN novel junctions?

stmn2_novel <- filter(stmn2, junction_type %in% c("novel_5SS")) 

stmn2_novel_other <- 
  filter(stmn2_novel, disease == "Other")

stmn2_novel_other %>% filter(counts > 0 ) %>% arrange(desc(counts)) %>% select(sample, tissue, counts, disease_full)
##           sample               tissue counts
## 1 CGND-HRA-00568  Medial Motor Cortex      5
## 2 CGND-HRA-00404           Cerebellum      1
## 3 CGND-HRA-00792       Frontal Cortex      1
## 4 CGND-HRA-00793      Temporal Cortex      1
## 5 CGND-HRA-00796   Lumbar Spinal Cord      1
## 6 CGND-HRA-00569 Lateral Motor Cortex      1
##                      disease_full
## 1 Primary Lateral Sclerosis (PLS)
## 2   Multiple System Atrophy (MSA)
## 3        Progressive Bulbar Palsy
## 4        Progressive Bulbar Palsy
## 5        Progressive Bulbar Palsy
## 6 Primary Lateral Sclerosis (PLS)
all_counts <- stmn2_novel %>% arrange(desc(counts)) %>% select(sample, tissue, counts_tpm, disease_full) %>% pull(counts_tpm)

# hist(all_counts)
# 
# table(all_counts > 0.01)
# 
# 
# table(stmn2_novel$disease, stmn2_novel$counts_tpm > 0)
# table(stmn2_novel$disease, stmn2_novel$counts_tpm > 0.005)
# table(stmn2_novel$disease, stmn2_novel$counts_tpm > 0.01)
# 
# min(stmn2_novel$counts_tpm[ stmn2_novel$counts_tpm > 0])
# max(stmn2_novel$counts_tpm[ stmn2_novel$counts_tpm > 0])

#0.006 JPM = 0.006 reads per 1 million = 1 read per 166 million - 2 samples were sequenced at a depth of >160million reads.

# under a null model, the number of STMN2 junction reads should increase with library size
stmn2_novel %>%
 filter( disease == "ALS", pathology != "ALS-SOD1") %>%
  filter(tissue %in% tissues_to_plot) %>%
  ggplot(aes(x =library_size, counts) ) + geom_point(aes(colour = counts == 0)) +
  facet_wrap(~tissue) +
  labs(title = "All non-SOD1 ALS samples - effect of library size on novel junction") +
  theme_jh()

Some samples were sequenced at double the average sequencing depth. If the assumption is all non-SOD1 ALS samples should have TDP mislocalisation in the spinal cord, does doubling the sequencing depth improve my chance of detecting 1+ STMN2 reads? Yes. In the spinal cord samples, all but 1 of the samples with the high read depth has 1+ STMN2 read. This suggests the lack of completeness across the ALS samples is due to sequencing depth.

Investigating Control samples

5 control samples have a single STMN2 read. 3 of them are Cerebellum. Does this mean let’s accept at least 2 reads as a minimum?

#with(stmn2_novel, table(pathology, true = counts > 1, tissue) )

stmn2_novel %>%
  filter(disease == "Control") %>%
  arrange(desc(counts)) %>% 
  head(10)
##                          junction           sample counts  individual
## 1  chr8:79611214:79616822:clu_1_-   CGND-HRA-00049      1 NEUHC282LVJ
## 2  chr8:79611214:79616822:clu_1_-   CGND-HRA-00593      1 NEUDA151YMK
## 3  chr8:79611214:79616822:clu_1_-   CGND-HRA-01242      1 NEUEG442LDR
## 4  chr8:79611214:79616822:clu_1_-   CGND-HRA-01146      1  PF-UCL-116
## 5  chr8:79611214:79616822:clu_1_-   CGND-HRA-01149      1  PF-UCL-117
## 6  chr8:79611214:79616822:clu_1_- CGND-HRA-00591-2      0 NEUHA141EEC
## 7  chr8:79611214:79616822:clu_1_-   CGND-HRA-00206      0 NEUHD481VCL
## 8  chr8:79611214:79616822:clu_1_-   CGND-HRA-00212      0 NEUHD481VCL
## 9  chr8:79611214:79616822:clu_1_-   CGND-HRA-00218      0 NEUHD481VCL
## 10 chr8:79611214:79616822:clu_1_-   CGND-HRA-00224      0 NEUHD481VCL
##         region               tissue         tissue_clean site rin pmi
## 1  Spinal_Cord Cervical Spinal Cord Cervical_Spinal_Cord UCSD 7.0  10
## 2   Cerebellum           Cerebellum           Cerebellum  JHU 6.9  14
## 3       Cortex       Frontal Cortex       Frontal_Cortex  JHU 7.5   6
## 4   Cerebellum           Cerebellum           Cerebellum  UCL 4.6  13
## 5   Cerebellum           Cerebellum           Cerebellum  UCL 2.7  13
## 6  Spinal_Cord Cervical Spinal Cord Cervical_Spinal_Cord  JHU 5.3  14
## 7   Cerebellum           Cerebellum           Cerebellum CUMC 8.5   7
## 8       Cortex       Frontal Cortex       Frontal_Cortex CUMC 7.5   7
## 9       Cortex  Medial Motor Cortex         Motor_Cortex CUMC 5.7   7
## 10      Cortex Lateral Motor Cortex         Motor_Cortex CUMC 6.5   7
##      platform                  prep      quote      lane   flowcell    sex
## 1  HiSeq 2500 Manual KAPA RiboErase CGND_12856      L001 ACB20PANXX Female
## 2  HiSeq 2500 Manual KAPA RiboErase CGND_13066      L002 ACBRAPANXX Female
## 3  HiSeq 2500 Manual KAPA RiboErase CGND_13132      L002 ACC56PANXX   Male
## 4     NovaSeq  Automated KAPA Total CGND_13716      L001  HHGMFDSXX   Male
## 5     NovaSeq  Automated KAPA Total CGND_13716      L003  HHGMFDSXX Female
## 6     NovaSeq  Automated KAPA Total CGND_13867 L001+L002  HJLM5DMXX   Male
## 7  HiSeq 2500 Manual KAPA RiboErase CGND_12675      L004 BCAYDHANXX Female
## 8  HiSeq 2500 Manual KAPA RiboErase CGND_12675      L004 BCAYDHANXX Female
## 9  HiSeq 2500 Manual KAPA RiboErase CGND_12675      L004 BCAYDHANXX Female
## 10 HiSeq 2500 Manual KAPA RiboErase CGND_12675      L004 BCAYDHANXX Female
##    disease disease_full age onset    motor_onset mutations pathology
## 1  Control         <NA>  68    NA Not Applicable      None   Control
## 2  Control         <NA>  37    NA Not Applicable      None   Control
## 3  Control         <NA>  22    NA Not Applicable      None   Control
## 4  Control         <NA>  88    NA Not Applicable      None   Control
## 5  Control         <NA>  80    NA Not Applicable      None   Control
## 6  Control         <NA>  72    NA Not Applicable      None   Control
## 7  Control         <NA>  54    NA Not Applicable      None   Control
## 8  Control         <NA>  54    NA Not Applicable      None   Control
## 9  Control         <NA>  54    NA Not Applicable      None   Control
## 10 Control         <NA>  54    NA Not Applicable      None   Control
##    library_size      tdp_status disease_tdp junction_type counts_tpm
## 1     113619315 TDP-43 negative     Control     novel_5SS 0.01760264
## 2      77258265 TDP-43 negative     Control     novel_5SS 0.02588720
## 3      83524909 TDP-43 negative     Control     novel_5SS 0.02394495
## 4      88636750 TDP-43 negative     Control     novel_5SS 0.02256400
## 5     116170973 TDP-43 negative     Control     novel_5SS 0.01721600
## 6      82331600 TDP-43 negative     Control     novel_5SS 0.00000000
## 7      70994214 TDP-43 negative     Control     novel_5SS 0.00000000
## 8      88064237 TDP-43 negative     Control     novel_5SS 0.00000000
## 9     109710526 TDP-43 negative     Control     novel_5SS 0.00000000
## 10     89984464 TDP-43 negative     Control     novel_5SS 0.00000000

1 sample with “Other” has 5 STMN2 novel junction reads in the medial motor cortex and a single read in the lateral motor cortex. That individual had a diagnosis of Primary Lateral Sclerosis but died at 56 - potential ALS misdiagnosis?

Another patient has a single read in Frontal, Temporal and Lumbar Spinal Cord, suggestive of ALS-FTD. However this patient was diagnosed as having Progressive Bulbar Palsy and died at 84.

The final patient with an “Other” diagnosis died at 45 of MSA. However the junction was found in Cerebellum, which intriguingly has multiple samples with a single junction in.

Observing the relationship between the two STMN2 junctions

Can this be modelled as a ratio of the two transcripts? For that I would want to assume that the two junctions are negatively correlated - when the amount of novel STMN2 goes up, the amount of canonical STMN2 goes down.

However this relationship only holds if the two isoforms are both widely expressed, which is probably not the case.

# plot correlations between levels
stmn2_wide_tpm <- 
  stmn2 %>%
  select( sample, pathology, disease, counts_tpm, junction_type, tissue) %>%
  tidyr::spread(key = junction_type, value = counts_tpm) 


stmn2_wide_raw <- 
  stmn2 %>%
  select( sample, pathology, disease, counts, junction_type, tissue) %>%
  tidyr::spread(key = junction_type, value = counts) 

# stmn2_wide %>%
#   ggplot(aes(x = `major isoform`, y = `novel 3SS`) ) + geom_point(aes(colour = pathology) ) +
#   theme_jh()

stmn2_wide_raw %>%
  filter( novel_5SS > 1 ) %>%
   filter(tissue %in% c("Frontal Cortex", "Temporal Cortex", "Cervical Spinal Cord", "Lumbar Spinal Cord", "Lateral Motor Cortex","Medial Motor Cortex") ) %>%
  ggplot(aes(x = major, y = novel_5SS) ) + geom_point(aes(colour = disease), size = 1 ) +
  theme_jh() + 
  facet_wrap(~tissue) +
  labs(x = "Canonical junction reads", y ="Novel junction reads") +
  theme(panel.border = element_rect(fill = NA)) +
  ggpubr::stat_cor(method = "spearman") +
  labs(title = "Raw counts of the two STMN2 junctions have tissue specific correlation structure")

# stmn2_wide_tpm %>%
#   filter( novel_5SS > 0.01 ) %>%
#    filter(tissue %in% c("Frontal Cortex", "Temporal Cortex", "Cervical Spinal Cord", "Lumbar Spinal Cord", "Lateral Motor Cortex","Medial Motor Cortex") ) %>%
#   ggplot(aes(x = major, y = novel_5SS) ) + geom_point(aes(colour = disease), size = 1 ) +
#   theme_jh() + 
#   facet_wrap(~tissue) +
#   labs(x = "Canonical junctions per million", y ="Novel junctions per million") +
#   theme(panel.border = element_rect(fill = NA)) +
#   ggpubr::stat_cor(method = "spearman") +
#   labs(title = "Normalised counts of the two STMN2 junctions have tissue specific correlation structure")

# does regressing out variation in major isoform remove novel isoform? probably not.
# 
# stmn2_wide %>%
#   ggplot(aes(x = `major isoform`, y = `novel 5SS`) ) + geom_point(aes(colour = as.factor(fcx_nmf_cluster) ) ) +
#   theme_jh()
# 
# stmn2_wide %>%
#   ggplot(aes(y = `major isoform`, x = pathology) ) + geom_jitter(aes(colour = as.factor(fcx_nmf_cluster)), width = 0.25 ) +
#   theme_jh()
# 
# stmn2_wide %>%
#   ggplot(aes(x = PC1, y = `major isoform`)) + geom_point(aes(colour = fcx_nmf_cluster))
# 
# stmn2_wide %>%
#   ggplot(aes(x = PC1, y = `novel 5SS`)) + geom_point(aes(colour = fcx_nmf_cluster))

Opposite correlations appear, with negative correaltions between STMN2 canonical isoform and the novel isoform in FTD samples in Frontal and Temporal Cortexes. However, There is a positive correlation between the two in ALS samples in the spinal cord. Both these phenomena persist when normalising by library size. In FTD this suggests that STMN2 novel splicing is associated with a reduction in canonical STMN2 expression whereas in ALS the two transcripts potentially act independently, both increasing with the proportion of neurons captured in a sample.

Estimate neuronal proportion in each sample and compare to STMN novel and canonical expression.

Consider collapsing Motor Cortex and Spinal Cord regions down?

Site of Motor onset and STMN2 junctions

# stmn2_novel %>%
#   filter(region == "Spinal_Cord") %>%
#   ggplot(aes(x = motor_onset, y = count))

stmn2_motor_onset <- 
  stmn2 %>%
  filter( junction_type %in% c("novel_5SS")) %>%
  filter(disease == "ALS", motor_onset %in% c("Limb", "Bulbar"), !mutations %in% c("SOD1", "ANG")) %>%
  filter( tissue %in% c("Cervical Spinal Cord", "Lumbar Spinal Cord", "Lateral Motor Cortex", "Medial Motor Cortex") ) %>%
  mutate( stmn2_1 = counts == 1, stmn2_2 = counts >=2 & counts < 5, stmn2_5 = counts >= 5, stmn2_all = counts > 0) %>%
  group_by(tissue, motor_onset ) %>%
  summarise( samples = n(), all = sum(stmn2_all) / n(),  `1` = sum(stmn2_1) / n(), `2-5` = sum(stmn2_2) / n(), `5+` = sum(stmn2_5) / n()  ) %>%
  tidyr::gather(key = "threshold", value = prop, -tissue, -motor_onset, -samples) %>%
  ungroup()

# per tissue proportion test
stmn2_motor_onset  %>%
  filter(threshold == "all") %>%
  mutate( x = prop * samples) %>%
  split(.$tissue) %>%
  purrr::map_df( ~{ tibble(p.value = prop.test(x = .x$x, n = .x$samples)$p.value ) }, .id = "tissue")
## Warning in prop.test(x = .x$x, n = .x$samples): Chi-squared approximation
## may be incorrect

## Warning in prop.test(x = .x$x, n = .x$samples): Chi-squared approximation
## may be incorrect
## # A tibble: 4 x 2
##   tissue               p.value
##   <chr>                  <dbl>
## 1 Cervical Spinal Cord   0.821
## 2 Lateral Motor Cortex   0.585
## 3 Lumbar Spinal Cord     0.440
## 4 Medial Motor Cortex    1.000
# all tissues combined - proportion test
stmn2_motor_onset  %>%
  filter(threshold == "all") %>%
  mutate( x = prop * samples) %>%
  group_by( motor_onset ) %>%
  summarise( n = sum(samples), x = sum(x)) %>%
  with( prop.test(x =x , n = n) )
## 
##  2-sample test for equality of proportions with continuity
##  correction
## 
## data:  x out of n
## X-squared = 1.5573, df = 1, p-value = 0.2121
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.0482568  0.2470944
## sample estimates:
##    prop 1    prop 2 
## 0.6323529 0.5329341
stmn2_motor_onset %>%
  filter(threshold != "all") %>%
  ggplot(aes(x = motor_onset, y = prop)) + 
  geom_col(aes(alpha = threshold), colour = "white", fill = "blue") +
  facet_wrap(~tissue) +
  theme_jh() + 
  scale_alpha_manual("read\nthreshold",values = c(0.3,0.6,1) ) +
  theme( panel.border = element_rect(fill = NA), axis.line = element_blank() ) +
  labs(title = "STMN2 novel junctions in sporadic ALS patients", x = "Site of motor onset", y = "Percentage of samples") + 
  scale_y_continuous(expand = c(0,0), limits = c(0,1.1),labels = scales::percent )